#UNDERSTAND THE BUSINESS
#It is important for entrepreneurs to know how likely a project is going to be "sucessful" if launched on Kickstarter
#possible factors include not meeting minimum funding goal or the category of the product/service.
#This project analyzes statistics to find correlation among 13 features of a project
#to predict a likelihood of a project to be succesful on Kickstarter - that is fully funded
#Import data set from url: https://www.kaggle.com/kemical/kickstarter-projects/data
#data in 2018
url1 <- "/Users/tamvu/DS4100/Kickstarter Predict/Startup Prediction/kickstarter-projects/ks-projects-201801.csv"
raw_df <- as.data.frame(read.csv(url1), na.strings = c("", "NA", "undefined"))
#original size is 350k+ which is too big data. I will take a sample 30%
raw_df <- raw_df[sample(nrow(raw_df) * 0.4), ]
library(RMySQL)
#CONNECT TO MYSQL DATABASE
awsURL <- "mypersonalinstance.cwoxgifwds2r.us-east-1.rds.amazonaws.com"
con <- dbConnect(MySQL(),
user = "vut1307",
password = "Vuminhtam1307",
host = awsURL,
port = 3306,
dbname = "kickstarterprojects")
#import data into the database
dbWriteTable(con, name = 'dataset', raw_df, overwrite = TRUE)
[1] TRUE
-- RETRIEVE DATA FROM DATABASE
SELECT * FROM dataset;
#UNDERSTAND THE DATA
#There are over 350k entries recorded since April 2009 to December 2017. Total 15 variables
#Since ID and names are indentity of the data rather than a factor that determines any outcome,
#one of the variable is the actual outcome of the data: sucessful/failed/canceled/live.
#the rest are features that can contribute to the state
#DEFINE OUTCOME "sucessful":
#states of a project recorded into the dataset
df <- raw_df
table(df$state)
canceled failed live successful suspended undefined
15548 78797 1127 53875 707 1410
#I am only taking into account projects that are not live or suspended (because there is no result to it yet if so)
#projects that are not canceled (because the result is not one of the features in the dataset)
#only projects that is determined with final state (either failed/sucessful)
#EXPLORATORY DATA ANALYSIS: analyze to find important factors that indicate the outcome of successful or failed projects
#percentage of sucessful vs failed
n <- nrow(subset(df, df$state == "successful" | df$state == "failed"))
mytable <- as.data.frame(table(df$state) / n * 100)
slices <- c(59.33, 40.67)
lbls <- paste(c( "failed", "successful"), slices, "%")
pie(slices, labels = lbls, main="Pie Chart of Sucessful projects")

#RETRIEVE DATA TO EXPLORE
#What is the trendiest product being proposed on Kickstarter in the sample?
dbGetQuery(con, '
SELECT main_category AS most_popular_category,
AVG(backers) AS num_backers,
AVG(usd_goal_real) AS avg_goal,
AVG(usd_pledged_real) AS avg_pledged,
(SELECT country FROM dataset WHERE state = "successful" GROUP BY country ORDER BY count(*) DESC LIMIT 1) AS most_by_country,
COUNT(*) AS number_proposed FROM dataset
GROUP BY main_category
ORDER BY number_proposed DESC LIMIT 1;
')
Decimal MySQL column 1 imported as numeric
#What is the trendiest product being proposed on Kickstarter that is sucessful?
#What is the trendiest product being proposed on Kickstarter that is failed?
dbGetQuery(con, '
SELECT * FROM
(SELECT state, main_category AS most_popular_category,
AVG(backers) AS num_backers,
AVG(usd_goal_real) AS avg_goal,
AVG(usd_pledged_real) AS avg_pledged, COUNT(*) AS number_proposed
FROM dataset WHERE state = "successful"
GROUP BY main_category ORDER BY number_proposed DESC LIMIT 1) good
UNION
(SELECT state, main_category AS most_popular_category,
AVG(backers) AS num_backers,
AVG(usd_goal_real) AS avg_goal,
AVG(usd_pledged_real) AS avg_pledged, COUNT(*) AS number_proposed
FROM dataset WHERE state = "failed"
GROUP BY main_category ORDER BY number_proposed DESC LIMIT 1);
')
Decimal MySQL column 2 imported as numeric
#as we can see from queries the most popular projects is Film and Video but they are most likely to fail.
#What is a good ammount of goal for a project to be less likely to fail? MUCH LOWER THAN THE FAILED PROJECTS
#Compare the successful projects compared to failed projects with same amount of backers but lower goals than average bad goals?
dbGetQuery(con, '
SELECT D1.main_category, D1.goal AS good_goal, D.avg_fail_goal FROM dataset D1
JOIN (SELECT *, AVG(goal) AS avg_fail_goal FROM dataset D2 WHERE D2.state = "failed") D
ON D1.backers = D.backers AND D1.goal < D.avg_fail_goal
WHERE D1.state = "successful"
ORDER BY good_goal DESC;
')
-- Get the projects that is active till most recently.
SELECT * FROM dataset WHERE state = "live" AND YEAR(deadline) = 2018 ORDER BY month(deadline) DESC;
#DATA PREPARATION: CLEANING, IMPUTATION
#raw data has 113 entries.
#MISSING VALUES: disregard all entries that have any missing values of any variable
#which results in 112k entries with total 15 features
df <- na.omit(df)
#CONSTRUCT DATA: calculate the duration of the project
#I want to disregard the effect of actual time because the result will depend on the events have actually happen at the time.
df$duration <- as.numeric(as.Date(df$deadline) - as.Date(df$launched))
#SHAPE DATA:DUMMY CODE for the catergories
#1 - Art, 2 - Comics, 3 - Crafts, 4 - Dance, 5 - Design, 6 - Fashion, 7 - Film & Video,
#8 - Food, 9 - Games, 10 - Journalism, 11 - Music, 12 - Photography, 13 - Publishing, 14 - Technology
df$encode_category <- as.numeric(df$main_category)
#DUMMY CODE FOR state: successful = 1, failed = 0
df$encode_state <- as.numeric(df$state)/2 - 1
#DUMMY CODE FOR country
df$encode_country <- as.numeric(df$country)
#CLEAN DATA to sucessful/failed only.
#I am only taking into account projects that are not live or suspended (because there is no result to it yet if so)
#projects that are not canceled (because the result is not one of the features in the dataset)
#only projects that is determined with final state (either failed/sucessful)
# results in 331465 entries
df <- subset(df, df$state == "failed" | df$state == "successful")
#OMIT IRRELEVANT variables: ID and names. results in 13 variables: 12 features and 1 that is the state
#I also omit (sub)category and only consider main_category to be more general
#Omit time variables: deadline and launched
#Money variable like goals and pledged I am choosing the columns in USD only so ommit currency and general goal and pledged columns
nonvars <- c("ID","name", "category", "goal", "launched", "deadline", "pledged", "state", "currency", "main_category", "country", "usd.pledged")
df <- df[,!(names(df) %in% nonvars)]
#EXPLORE DATA: CORRELATION ANALYSIS
cor(df$usd_goal_real, df$encode_state)
[1] -0.02301614
cor(df$backers, df$encode_state)
[1] 0.1327455
cor(df$duration, df$encode_state)
[1] -0.1203274
cor(df$usd_pledged_real, df$encode_state)
[1] 0.1283374
# the sucessful projects has lower funding goals, lower duration,
# higher number of backers, and higher pledged in USD
#EXPLORATORY PLOTS - DETECT AND REMOVE OUTLIERS
#function to detetect and remove outliers
#source https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
outlierKD <- function(dt, var, str) {
var_name <- eval(substitute(var),eval(dt))
tot <- sum(!is.na(var_name))
na1 <- sum(is.na(var_name))
m1 <- mean(var_name, na.rm = T)
par(mfrow=c(2, 2), oma=c(0,0,3,0))
boxplot(var_name, main="With outliers")
hist(var_name, main="With outliers", xlab=NA, ylab=NA)
outlier <- boxplot.stats(var_name)$out
mo <- mean(outlier)
var_name <- ifelse(var_name %in% outlier, NA, var_name)
boxplot(var_name, main="Without outliers")
hist(var_name, main="Without outliers", xlab=NA, ylab=NA)
title(paste("Outlier Check of", str) , outer=TRUE)
na2 <- sum(is.na(var_name))
message("Outliers identified: ", na2 - na1, " from ", tot, " observations")
message("Proportion (%) of outliers: ", (na2 - na1) / tot*100)
message("Mean of the outliers: ", mo)
m2 <- mean(var_name, na.rm = T)
message("Mean without removing outliers: ", m1)
message("Mean if we remove outliers: ", m2)
response <- "yes"
if(response == "y" | response == "yes"){
dt[as.character(substitute(var))] <- invisible(var_name)
assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
message("Outliers successfully removed", "\n")
return(invisible(dt))
} else{
message("Nothing changed", "\n")
return(invisible(var_name))
}
}
#remove outliers of number of backers
outlierKD(df,backers, "number of backers")

#remove outliers of pledged number
outlierKD(df,usd_pledged_real, "USD real pledged")

outlierKD(df,usd_goal_real, "goal in USD")

#NORMALIZATION OF DATA
#MIN-MAX NORMALIZATION of backers and duration of fundraising
min_max_norm <- function(data) {
options(scipen = 999)
minX <- min(as.numeric(data))
maxX <- max(as.numeric(data))
data <- (as.numeric(data) - minX)/(maxX - minX)
}
min_max_norm(df$backers)
min_max_norm(df$duration)
#NORMALIZATION OF DATA
#Z-SCORE STANDARDIZATION of goal and pledged
df$usd_goal_real <- scale(df$usd_goal_real)
df$usd_pledged_real <- scale(df$usd_pledged_real)
#DATA MODELING
#Create a stratified sample where you randomly select 70% of successful and failed projects to be part of the validation data set.
df <- na.omit(df)
train_Set <- data.frame()
set.seed(5)
for(i in 0:1) {
all_of_type_i <- subset(df, df$encode_state == i)
train_Set <- rbind(train_Set, all_of_type_i[sample(0.7 * nrow(all_of_type_i)),])
}
#The remaining cases will form the training data set.
test_Set <- df[!(as.numeric(row.names(df)) %in% as.numeric(row.names(train_Set))), ]
#MULTIPLE LINEAR REGRESSION MODEL
lin_model <- lm(train_Set$encode_state ~ ., data = train_Set)
summary(lin_model)
Call:
lm(formula = train_Set$encode_state ~ ., data = train_Set)
Residuals:
Min 1Q Median 3Q Max
-1.3363 -0.2551 -0.1029 0.2750 0.7867
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.26469518 0.00609693 43.415 < 0.0000000000000002 ***
backers 0.00458467 0.00006462 70.946 < 0.0000000000000002 ***
usd_pledged_real 0.17782802 0.00232103 76.616 < 0.0000000000000002 ***
usd_goal_real -0.18498998 0.00138442 -133.623 < 0.0000000000000002 ***
duration -0.00196377 0.00009581 -20.497 < 0.0000000000000002 ***
encode_category -0.00014506 0.00031003 -0.468 0.64
encode_country 0.00130859 0.00020254 6.461 0.000000000105 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3309 on 70681 degrees of freedom
Multiple R-squared: 0.5214, Adjusted R-squared: 0.5214
F-statistic: 1.283e+04 on 6 and 70681 DF, p-value: < 0.00000000000000022
#REVISE THE LINEAR REGRESSION
lin_model <- lm(train_Set$encode_state ~ . - encode_category, data = train_Set)
summary(lin_model)
Call:
lm(formula = train_Set$encode_state ~ . - encode_category, data = train_Set)
Residuals:
Min 1Q Median 3Q Max
-1.3368 -0.2551 -0.1029 0.2750 0.7861
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.26351128 0.00554703 47.505 < 0.0000000000000002 ***
backers 0.00458482 0.00006462 70.950 < 0.0000000000000002 ***
usd_pledged_real 0.17782397 0.00232100 76.615 < 0.0000000000000002 ***
usd_goal_real -0.18502058 0.00138287 -133.795 < 0.0000000000000002 ***
duration -0.00196573 0.00009571 -20.538 < 0.0000000000000002 ***
encode_country 0.00130951 0.00020253 6.466 0.000000000101 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3309 on 70682 degrees of freedom
Multiple R-squared: 0.5214, Adjusted R-squared: 0.5214
F-statistic: 1.54e+04 on 5 and 70682 DF, p-value: < 0.00000000000000022
#Ideal linear regression model for predicting the state of a project on Kickstarter in this data set
#include number of backers, USD pledged, USD goal, duration and country.
#All are significant with P-value < 0.05
#LOGISTIC REGRESSION MODEL
log_model <- glm(train_Set$encode_state ~ ., data = train_Set)
summary(log_model)
Call:
glm(formula = train_Set$encode_state ~ ., data = train_Set)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.3363 -0.2551 -0.1029 0.2750 0.7867
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.26469518 0.00609693 43.415 < 0.0000000000000002 ***
backers 0.00458467 0.00006462 70.946 < 0.0000000000000002 ***
usd_pledged_real 0.17782802 0.00232103 76.616 < 0.0000000000000002 ***
usd_goal_real -0.18498998 0.00138442 -133.623 < 0.0000000000000002 ***
duration -0.00196377 0.00009581 -20.497 < 0.0000000000000002 ***
encode_category -0.00014506 0.00031003 -0.468 0.64
encode_country 0.00130859 0.00020254 6.461 0.000000000105 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.1094887)
Null deviance: 16170.3 on 70687 degrees of freedom
Residual deviance: 7738.8 on 70681 degrees of freedom
AIC: 44256
Number of Fisher Scoring iterations: 2
#REVISE LOGISTIC REGRESSION MODEL
log_model <- glm(train_Set$encode_state ~ . - encode_category, data = train_Set)
summary(log_model)
Call:
glm(formula = train_Set$encode_state ~ . - encode_category, data = train_Set)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.3368 -0.2551 -0.1029 0.2750 0.7861
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.26351128 0.00554703 47.505 < 0.0000000000000002 ***
backers 0.00458482 0.00006462 70.950 < 0.0000000000000002 ***
usd_pledged_real 0.17782397 0.00232100 76.615 < 0.0000000000000002 ***
usd_goal_real -0.18502058 0.00138287 -133.795 < 0.0000000000000002 ***
duration -0.00196573 0.00009571 -20.538 < 0.0000000000000002 ***
encode_country 0.00130951 0.00020253 6.466 0.000000000101 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.1094875)
Null deviance: 16170.3 on 70687 degrees of freedom
Residual deviance: 7738.8 on 70682 degrees of freedom
AIC: 44254
Number of Fisher Scoring iterations: 2
#Ideal multiple regression model for predicting outcome of project in this data set include number of backers, USD pledged, USD goal, duration and country. All are significant with P-value < 0.05
#From both regression models, category is not a significant idicator of the outcome - which was my hypothesis when I started the project. A protential reason is that the category is only significant regarding the events/trend of the time of the project. For example waste-free trend will elevate a chance of success for a environmental-friendly product like beewax wraper (Bee's Wrap).
library('class')
#TUNING OF THE K-NN MODEL.
#Determine an optimal k by trying all values from 5 through 15 for k-NN algorithm against the cases in the validation data set.
#Source code: https://www.r-bloggers.com/using-knn-classifier-to-predict-whether-the-price-of-stock-will-increase/
min <- 2
max <- 10
accuracy <- rep(0, max-min+1) #initialize with 0 accuracy
k <- min:max
for(x in k){
prediction <- knn(train_Set, test_Set, train_Set$encode_state, k = x)
accuracy[x] <- mean(prediction == test_Set$encode_state) * 100 #calculate accuracy
}
#What is the optimal k, i.e., the k that results in the best accuracy? Plot k versus accuracy.
plot(k, accuracy[min:max], type = 'b', main = "Accuracy by k", xlab = "accuracy (%)")

#Most accurate k = 11. REVISE AND BUILD THE K-NN MODEL WITH K = 11
library('class')
knn_model <- knn(train = train_Set, test = test_Set, train_Set$encode_state, k = 3)
table(knn_model, test_Set$encode_state)
knn_model 0 1
0 13823 451
1 847 7633
# What is the percentage of correct prediction of sucessful projects?
knn_accuracy <- mean(knn_model == test_Set$encode_state) * 100
#helper function to calculate the MAD. linear and logistic regression model has same amount of coefficients
predict_value <- function(data, model, i) {
return(model$coefficients[[1]]
+ data$backers[i] * model$coefficients[[2]]
+ data$usd.pledged[i] * model$coefficients[[3]]
+ data$usd_pledged_real[i] * model$coefficients[[4]]
+ data$duration[i] * model$coefficients[[5]]
+ data$encode_country[i] * model$coefficients[[6]])
}
MAD <- function(model) {
sum <- 0
n <- nrow(df)
for(i in 1:n) {
sum <- sum + abs(predict_value(df,model, i) - df$encode_state[i])
}
return(sum)
}
#EVALUATE THE FIT OF MODELS
#Linear model:
sumr_lin <- summary(lin_model)
lin_MAD <- MAD(lin_model)
lin_MSE <- mean(summary(lin_model)$residuals^ 2)
paste("Linear Regression model has R-squared of", sumr_lin$r.squared, "which explains the model fits the training data well; has MAD of", lin_MAD, "which is actually higher than R-squared; has MSE of", lin_MSE)
[1] "Linear Regression model has R-squared of 0.522412900229395 which explains the model fits the training data well; has MAD of which is actually higher than R-squared; has MSE of 0.109393976331674"
#Logistic model
log_MAD <- MAD(log_model)
paste("Logistic Regression model has MAD of", log_MAD, "explains the model fits the training data well")
[1] "Logistic Regression model has MAD of explains the model fits the training data well"
#A function to determine if the model has bias (false positive), e.g predicting a project failed when they actually succeeded
bias <- function(model) {
prediction <- predict(model, type = "response", newdata = test_Set)
prediction <- ifelse(prediction > 0.5,1,0) #set survival = 1
combine <- data.frame(prediction, test_Set$encode_state)
#filter out all errors the model made
error <- subset(combine, combine$prediction != combine$test_Set.encode_state)
#false positive are the errors with people who are actual survived
falsePositive <- subset(error, error$test_Set.encode_state == 1)
falseNegative <- subset(error, error$test_Set.encode_state == 0)
return(c(nrow(falsePositive)/length(prediction)*100, nrow(falseNegative)/length(prediction)*100))
}
#COMPARE BIAS
falsePos <- bias(log_model)[1] #2%
falseNeg <- bias(log_model)[2] #10%
falsePos_lin <- bias(lin_model)[1] #2%
falseNeg_lin <- bias(lin_model)[2] #10%
paste("Logistic Regression Model and Linear Regression Model has more false positive. Hence they are bias towards predicting a project fail")
[1] "Logistic Regression Model and Linear Regression Model has more false positive. Hence they are bias towards predicting a project fail"
#function get_accuracy: takes in a model and output the percentage of how correct the prediction
get_accuracy <- function(name, model) {
prediction <- predict(model, type = "response", newdata = test_Set)
prediction <- ifelse(prediction > 0.5,1,0) #set survival = 1
#and determine its prediction accuracy (as a percentage correct).
#by comparing the prediction to the test data set survival result
correctPrediction <- mean(prediction == test_Set$encode_state)
return(paste('Prediction Accuracy of', name, 'is', correctPrediction * 100, '%'))
}
#COMPARE THE ACCURACY OF MODELS
paste(get_accuracy("Multiple Linear Regression Model", lin_model), "with R-squared is 0.51")
[1] "Prediction Accuracy of Multiple Linear Regression Model is 86.7495824909906 % with R-squared is 0.51"
get_accuracy("Logistic Regression Model", log_model)
[1] "Prediction Accuracy of Logistic Regression Model is 86.7495824909906 %"
paste("Prediction Accuracy of k-NN Model with k=3 is", knn_accuracy, "%")
[1] "Prediction Accuracy of k-NN Model with k=3 is 94.2955084820251 %"
paste("The regression models have very similar close results. Though linear regression model is has MAD higher than R-squared, both fit the training data well. Compared to k-NN, they are best predictors of a succesful project though has 10% false positive bias.")
[1] "The regression models have very similar close results. Though linear regression model is has MAD higher than R-squared, both fit the training data well. Compared to k-NN, they are best predictors of a succesful project though has 10% false positive bias."
#PREDICT THE LIVE PROJECTS USING MULTIPLE LINEAR REGRESSION:
#predict the active projects that were live until last month
#format the data frame
active_df <- subset(raw_df, raw_df$ID %in% active_df$ID)
active_df$duration <- as.numeric(as.Date(active_df$deadline) - as.Date(active_df$launched))
active_df$encode_category <- as.numeric(active_df$main_category)
active_df$encode_country <- as.numeric(active_df$country)
#predict successfulness using linear regression
pred_values <- c()
for(i in 1:nrow(active_df)) {
pred_values <- c(pred_values, predict_value(active_df, lin_model, i))
}
pred_values <- ifelse(pred_values > 0.5,1,0)
active_df$predict_state <- pred_values
active_df